<!DOCTYPE html>



  


<html class="theme-next gemini use-motion" lang="zh-Hans">
<head><meta name="generator" content="Hexo 3.9.0">
  <meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<!-- Viewport: no maximum-scale cap, so users can still pinch-zoom the page. -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#222">



  
  
    
    
  <script src="/dxl/lib/pace/pace.min.js?v=1.0.2"></script>
  <link href="/dxl/lib/pace/pace-theme-minimal.min.css?v=1.0.2" rel="stylesheet">







<meta http-equiv="Cache-Control" content="no-transform">
<meta http-equiv="Cache-Control" content="no-siteapp">
















  
  
  <link href="/dxl/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css">







<link href="/dxl/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css">

<link href="/dxl/css/main.css?v=5.1.4" rel="stylesheet" type="text/css">


  <link rel="apple-touch-icon" sizes="180x180" href="/dxl/images/apple-touch-icon-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="32x32" href="/dxl/images/favicon-32x32-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="16x16" href="/dxl/images/favicon-16x16-next.png?v=5.1.4">


  <link rel="mask-icon" href="/dxl/images/logo.svg?v=5.1.4" color="#222">





  <meta name="keywords" content="爬虫,">










<meta name="description" content="一、爬虫基本知识1.1什么是爬虫1就是通过编写程序模拟浏览器上网,然后让其去互联网上抓取数据的过程  反爬虫 1门户网站通过相应的策略和技术手段，防止爬虫程序进行网站数据的爬取  反反爬虫 1爬虫程序通过相应的策略和技术手段，破解门户网站的反爬虫手段，从而爬取到相应的数据  1.2分类通用爬虫: 1把互联网上的所有的网页下载下来，放到本地服务器里形成备分，在对这些网页做相关处理(提取关键字、去掉广">
<meta name="keywords" content="爬虫">
<meta property="og:type" content="article">
<meta property="og:title" content="爬虫理论知识">
<meta property="og:url" content="http://yoursite.com/2019/09/20/【爬虫01】爬虫理论知识/index.html">
<meta property="og:site_name" content="我的快乐时光">
<meta property="og:description" content="一、爬虫基本知识1.1什么是爬虫1就是通过编写程序模拟浏览器上网,然后让其去互联网上抓取数据的过程  反爬虫 1门户网站通过相应的策略和技术手段，防止爬虫程序进行网站数据的爬取  反反爬虫 1爬虫程序通过相应的策略和技术手段，破解门户网站的反爬虫手段，从而爬取到相应的数据  1.2分类通用爬虫: 1把互联网上的所有的网页下载下来，放到本地服务器里形成备分，在对这些网页做相关处理(提取关键字、去掉广">
<meta property="og:locale" content="zh-Hans">
<meta property="og:updated_time" content="2019-09-20T12:26:55.745Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="爬虫理论知识">
<meta name="twitter:description" content="一、爬虫基本知识1.1什么是爬虫1就是通过编写程序模拟浏览器上网,然后让其去互联网上抓取数据的过程  反爬虫 1门户网站通过相应的策略和技术手段，防止爬虫程序进行网站数据的爬取  反反爬虫 1爬虫程序通过相应的策略和技术手段，破解门户网站的反爬虫手段，从而爬取到相应的数据  1.2分类通用爬虫: 1把互联网上的所有的网页下载下来，放到本地服务器里形成备分，在对这些网页做相关处理(提取关键字、去掉广">



<script type="text/javascript" id="hexo.configurations">
  // Reuse an existing global NexT namespace object if one is already defined.
  var NexT = window.NexT || {};

  // Site-wide settings published as the global CONFIG object
  // (presumably read by the theme scripts loaded later in the page).
  var CONFIG = {
    // Base path, theme scheme and theme version.
    root: '/dxl/',
    scheme: 'Gemini',
    version: '5.1.4',

    // Sidebar placement and behaviour flags.
    sidebar: {
      position: 'left',
      display: 'post',
      offset: 12,
      b2t: true,
      scrollpercent: true,
      onmobile: false
    },

    // Feature toggles.
    fancybox: true,
    tabs: true,

    // Animation switch plus the transition name used for each page region.
    motion: {
      enable: true,
      async: false,
      transition: {
        post_block: 'fadeIn',
        post_header: 'slideDownIn',
        post_body: 'slideDownIn',
        coll_header: 'slideLeftIn',
        sidebar: 'slideUpIn'
      }
    },

    // Duoshuo comment settings.
    duoshuo: {
      userId: '0',
      author: '博主'
    },

    // Algolia search settings; the credentials are empty strings on this page.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {
        per_page: 10
      },
      labels: {
        input_placeholder: 'Search for Posts',
        hits_empty: "We didn't find any results for the search: ${query}",
        hits_stats: '${hits} results found in ${time} ms'
      }
    }
  };
</script>



  <!-- Canonical URL of this post.
       NOTE(review): the host "yoursite.com" looks like an unconfigured placeholder, and the
       path omits the "/dxl/" prefix that CONFIG.root and the mainEntityOfPage link in this
       page use. Confirm the real site URL in the generator config. -->
  <link rel="canonical" href="http://yoursite.com/2019/09/20/【爬虫01】爬虫理论知识/">





  <title>爬虫理论知识 | 我的快乐时光</title>
  








</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-Hans">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/dxl/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">我的快乐时光</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
      
        <p class="site-subtitle"></p>
      
  </div>

  <div class="site-nav-toggle">
    <!-- Icon-only navigation toggle: explicit type, plus an accessible name because the three bars carry no text. -->
    <button type="button" aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>

<nav class="site-nav">
  

  
    <ul id="menu" class="menu">
      
        
        <li class="menu-item menu-item-home">
          <a href="/dxl/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-home"></i> <br>
            
            首页
          </a>
        </li>
      
        
        <li class="menu-item menu-item-categories">
          <a href="/dxl/categories/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-th"></i> <br>
            
            分类
          </a>
        </li>
      

      
    </ul>
  

  
</nav>



 </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="http://yoursite.com/dxl/2019/09/20/【爬虫01】爬虫理论知识/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content>
      <meta itemprop="description" content>
      <meta itemprop="image" content="/dxl/images/avatar.png">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="我的快乐时光">
    </span>

    
      <header class="post-header">

        
        
          <h1 class="post-title" itemprop="name headline">爬虫理论知识</h1>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              
              <time title="创建于" itemprop="dateCreated datePublished" datetime="2019-09-20T20:44:46+08:00">
                2019-09-20
              </time>
            

            

            
          </span>

          
            <span class="post-category">
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/dxl/categories/爬虫/" itemprop="url" rel="index">
                    <span itemprop="name">爬虫</span>
                  </a>
                </span>

                
                
              
            </span>
          

          
            
              <span class="post-comments-count">
                <span class="post-meta-divider">|</span>
                <span class="post-meta-item-icon">
                  <i class="fa fa-comment-o"></i>
                </span>
                <a href="/dxl/2019/09/20/【爬虫01】爬虫理论知识/#comments" itemprop="discussionUrl">
                  <span class="post-comments-count valine-comment-count" data-xid="/dxl/2019/09/20/【爬虫01】爬虫理论知识/" itemprop="commentCount"></span>
                </a>
              </span>
            
          

          
          

          
            <span class="post-meta-divider">|</span>
            <span class="page-pv">本文总阅读量
            <span class="busuanzi-value" id="busuanzi_value_page_pv"></span>次
            </span>
          

          
            <div class="post-wordcount">
              
                
                <span class="post-meta-item-icon">
                  <i class="fa fa-file-word-o"></i>
                </span>
                
                  <span class="post-meta-item-text">字数统计&#58;</span>
                
                <span title="字数统计">
                  
                </span>
              

              
                <span class="post-meta-divider">|</span>
              

              
                <span class="post-meta-item-icon">
                  <i class="fa fa-clock-o"></i>
                </span>
                
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                
                <span title="阅读时长">
                  
                </span>
              
            </div>
          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <h2 id="一、爬虫基本知识"><a href="#一、爬虫基本知识" class="headerlink" title="一、爬虫基本知识"></a>一、爬虫基本知识</h2><h3 id="1-1什么是爬虫"><a href="#1-1什么是爬虫" class="headerlink" title="1.1什么是爬虫"></a>1.1什么是爬虫</h3><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">就是通过编写程序模拟浏览器上网,然后让其去互联网上抓取数据的过程</span><br></pre></td></tr></table></figure>

<p>反爬虫</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">门户网站通过相应的策略和技术手段，防止爬虫程序进行网站数据的爬取</span><br></pre></td></tr></table></figure>

<p>反反爬虫</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">爬虫程序通过相应的策略和技术手段，破解门户网站的反爬虫手段，从而爬取到相应的数据</span><br></pre></td></tr></table></figure>

<h3 id="1-2分类"><a href="#1-2分类" class="headerlink" title="1.2分类"></a>1.2分类</h3><p>通用爬虫:</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">把互联网上的所有的网页下载下来，放到本地服务器里形成备份，再对这些网页做相关处理(提取关键字、去掉广告)，最后提供一个用户检索接口。</span><br></pre></td></tr></table></figure>

<p>聚焦爬虫</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">聚焦爬虫是根据指定的需求抓取网络上指定的数据。例如：获取豆瓣上电影的名称和影评，而不是获取整张页面中所有的数据值。</span><br></pre></td></tr></table></figure>

<p>增量式爬虫:用于监测网站数据更新的情况,从而爬取网站中最新更新出来的数据.</p>
<h3 id="1-3-Anacanda"><a href="#1-3-Anacanda" class="headerlink" title="1.3 Anaconda"></a>1.3 Anaconda</h3><p>1、Anaconda:一款集成环境,集成的都是基于数据分析和机器学习的环境(模块)</p>
<p>Jupyter notebook:就是Anaconda提供的一款可视化的编码工具(基于浏览器)</p>
<p>2、快捷键</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">添加一个cell:a b</span><br><span class="line">删除cell:x</span><br><span class="line">双击:进入可编辑模式</span><br><span class="line">切换cell的模式:</span><br><span class="line">y:markdown-&gt;code</span><br><span class="line">m:code-&gt;markdown</span><br><span class="line">tab:</span><br><span class="line">执行cell:shift+enter</span><br><span class="line">打开帮助文档:shift+tab</span><br></pre></td></tr></table></figure>

<h3 id="1-4-http协议"><a href="#1-4-http协议" class="headerlink" title="1.4 http协议"></a>1.4 http协议</h3><p>1、定义：</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">就是客户端和服务端进行数据交互的形式</span><br></pre></td></tr></table></figure>

<p>2、常用头信息</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">User-Agent:请求载体的身份标识</span><br><span class="line">Connection:close</span><br><span class="line">content-type:</span><br></pre></td></tr></table></figure>

<p>3、https的加密形式</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">对称密钥加密:</span><br><span class="line">非对称密钥加密:</span><br><span class="line">证书密钥加密:</span><br></pre></td></tr></table></figure>

<h2 id="二、爬虫的基本模块"><a href="#二、爬虫的基本模块" class="headerlink" title="二、爬虫的基本模块"></a>二、爬虫的基本模块</h2><h3 id="2-1常用模块"><a href="#2-1常用模块" class="headerlink" title="2.1常用模块"></a>2.1常用模块</h3><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">requests:网络请求的模块,模拟浏览器发请求的. pip install requests</span><br><span class="line">urllib：看下博客&lt;发起请求&gt;</span><br></pre></td></tr></table></figure>

<h3 id="2-2爬虫流程"><a href="#2-2爬虫流程" class="headerlink" title="2.2爬虫流程"></a>2.2爬虫流程</h3><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">指定url ----&gt;发起请求-----&gt;获取响应数据-----&gt;持久化储存</span><br></pre></td></tr></table></figure>

<h2 id="三、爬虫的反扒机制与反反扒机制"><a href="#三、爬虫的反扒机制与反反扒机制" class="headerlink" title="三、爬虫的反扒机制与反反扒机制"></a>三、爬虫的反扒机制与反反扒机制</h2><h3 id="2-1-robots-txt协议"><a href="#2-1-robots-txt协议" class="headerlink" title="2.1 robots.txt协议"></a>2.1 robots.txt协议</h3><p>​    只是一个协议，可以不遵守</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br><span class="line">89</span><br><span class="line">90</span><br><span class="line">91</span><br><span class="line">92</span><br><span class="line">93</span><br><span class="line">94</span><br><span class="line">95</span><br><span class="line">96</span><br><span class="line">97</span><br><span class="line">98</span><br><span class="line">99</span><br><span class="line">100</span><br><span class="line">101</span><br><span class="line">102</span><br><span class="line">103</span><br><span class="line">104</span><br><span class="line">105</span><br><span class="line">106</span><br><span class="line">107</span><br><span class="line">108</span><br><span class="line">109</span><br><span class="line">110</span><br><span class="line">111</span><br><span class="line">112</span><br><span class="line">113</span><br><span class="line">114</span><br><span class="line">115</span><br><span class="line">116</span><br><span class="line">117</span><br><span class="line">118</span><br><span class="line">119</span><br><span class="line">120</span><br><span class="line">121</span><br><span 
class="line">122</span><br><span class="line">123</span><br><span class="line">124</span><br><span class="line">125</span><br><span class="line">126</span><br><span class="line">127</span><br><span class="line">128</span><br><span class="line">129</span><br><span class="line">130</span><br><span class="line">131</span><br><span class="line">132</span><br><span class="line">133</span><br><span class="line">134</span><br><span class="line">135</span><br><span class="line">136</span><br><span class="line">137</span><br><span class="line">138</span><br><span class="line">139</span><br><span class="line">140</span><br><span class="line">141</span><br><span class="line">142</span><br><span class="line">143</span><br><span class="line">144</span><br><span class="line">145</span><br><span class="line">146</span><br><span class="line">147</span><br><span class="line">148</span><br><span class="line">149</span><br><span class="line">150</span><br></pre></td><td class="code"><pre><span class="line">User-agent: Baiduspider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Googlebot</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: MSNBot</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: 
/homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Baiduspider-image</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: YoudaoBot</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Sogou web spider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Sogou inst spider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span 
class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Sogou spider2</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Sogou blog</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Sogou News Spider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Sogou Orion spider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span 
class="line"></span><br><span class="line">User-agent: ChinasoSpider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: Sosospider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line"></span><br><span class="line">User-agent: yisouspider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: EasouSpider</span><br><span class="line">Disallow: /baidu</span><br><span class="line">Disallow: /s?</span><br><span class="line">Disallow: /shifen/</span><br><span class="line">Disallow: /homepage/</span><br><span class="line">Disallow: /cpro</span><br><span class="line">Disallow: /ulink?</span><br><span class="line">Disallow: /link?</span><br><span class="line">Disallow: /home/news/data/</span><br><span class="line"></span><br><span class="line">User-agent: *</span><br><span class="line">Disallow: 
/</span><br></pre></td></tr></table></figure>


      
    </div>
    
    
    

    

    

    
      <div>
        <ul class="post-copyright">
  <li class="post-copyright-author">
    <strong>本文作者：</strong>
    
  </li>
  <li class="post-copyright-link">
    <strong>本文链接：</strong>
    <a href="http://yoursite.com/2019/09/20/【爬虫01】爬虫理论知识/" title="爬虫理论知识">http://yoursite.com/2019/09/20/【爬虫01】爬虫理论知识/</a>
  </li>
  <li class="post-copyright-license">
    <strong>版权声明： </strong>
    <!-- The license link opens in a new tab; noopener/noreferrer keep that tab from reaching back into this page. -->
    本博客所有文章除特别声明外，均采用 <a href="https://creativecommons.org/licenses/by-nc-sa/3.0/" rel="external nofollow noopener noreferrer" target="_blank">CC BY-NC-SA 3.0</a> 许可协议。转载请注明出处！
  </li>
</ul>

      </div>
    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/dxl/tags/爬虫/" rel="tag"># 爬虫</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/dxl/2019/09/20/【爬虫00】jupyter的使用/" rel="next" title="jupyter的使用">
                <i class="fa fa-chevron-left"></i> jupyter的使用
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/dxl/2019/09/20/【爬虫02】0爬取数据常用模块 /" rel="prev" title="爬虫常用模块一">
                爬虫常用模块一 <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>



    <div class="post-spread">
      
    </div>
  </div>


          </div>
          


          

  
    <div class="comments" id="comments">
    </div>
  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <img class="site-author-image" itemprop="image" src="/dxl/images/avatar.png" alt>
            
              <p class="site-author-name" itemprop="name"></p>
              <p class="site-description motion-element" itemprop="description"></p>
          </div>

          <nav class="site-state motion-element">

            
              <div class="site-state-item site-state-posts">
              
                <a href="/dxl/archives">
              
                  <span class="site-state-item-count">43</span>
                  <span class="site-state-item-name">日志</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-categories">
                <a href="/dxl/categories/index.html">
                  <span class="site-state-item-count">6</span>
                  <span class="site-state-item-name">分类</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-tags">
                <a href="/dxl/tags/index.html">
                  <span class="site-state-item-count">6</span>
                  <span class="site-state-item-name">标签</span>
                </a>
              </div>
            

          </nav>

          

          

          
          

          
          
            <div class="links-of-blogroll motion-element links-of-blogroll-inline">
              <div class="links-of-blogroll-title">
                <i class="fa  fa-fw fa-sign-out"></i>
                我的友链
              </div>
              <ul class="links-of-blogroll-list">
                
                  <li class="links-of-blogroll-item">
                    <!-- Ampersands in the query string are written as &amp; so the attribute is valid HTML (the parsed URL is unchanged); rel guards the new-tab open. -->
                    <a href="tencent://message/?Menu=yes&amp;uin=1258517737&amp;Site=QQ%E6%9E%81%E5%AE%A2&amp;Service=300&amp;sigT=45a1e5847943b64c6ff3990f8a9e644d2b31356cb0b4ac6b24663a3c8dd0f8aa12a595b1714f9d45/" title="申请坑位" target="_blank" rel="noopener noreferrer">申请坑位</a>
                  </li>
                
              </ul>
            </div>
          

          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#一、爬虫基本知识"><span class="nav-number">1.</span> <span class="nav-text">一、爬虫基本知识</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#1-1什么是爬虫"><span class="nav-number">1.1.</span> <span class="nav-text">1.1什么是爬虫</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#1-2分类"><span class="nav-number">1.2.</span> <span class="nav-text">1.2分类</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#1-3-Anacanda"><span class="nav-number">1.3.</span> <span class="nav-text">1.3 Anaconda</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#1-4-http协议"><span class="nav-number">1.4.</span> <span class="nav-text">1.4 http协议</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#二、爬虫的基本模块"><span class="nav-number">2.</span> <span class="nav-text">二、爬虫的基本模块</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#2-1常用模块"><span class="nav-number">2.1.</span> <span class="nav-text">2.1常用模块</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#2-2爬虫流程"><span class="nav-number">2.2.</span> <span class="nav-text">2.2爬虫流程</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#三、爬虫的反扒机制与反反扒机制"><span class="nav-number">3.</span> <span class="nav-text">三、爬虫的反扒机制与反反扒机制</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#2-1-robots-txt协议"><span class="nav-number">3.1.</span> <span class="nav-text">2.1 robots.txt协议</span></a></li></ol></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      
        <div class="back-to-top">
          <i class="fa fa-arrow-up"></i>
          
            <span id="scrollpercent"><span>0</span>%</span>
          
        </div>
      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; <span itemprop="copyrightYear">2019</span>
  <span class="with-love">
    <i class="fa fa-hand-peace-o"></i>
  </span>
  <span class="author" itemprop="copyrightHolder"></span>

  
</div>









        
<div class="busuanzi-count">
  <!-- Visitor-counter script that fills the busuanzi_value_* spans on this page.
       It is loaded from busuanzi.ibruce.info because the previous dn-lbstatics.qbox.me host was retired by the service. -->
  <script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

  
    <span class="site-uv">
      本站访客数
      <span class="busuanzi-value" id="busuanzi_value_site_uv"></span>
      人次
    </span>
  

  
    <span class="site-pv">
      本站总访问量
      <span class="busuanzi-value" id="busuanzi_value_site_pv"></span>
      次
    </span>
  
</div>








        
      </div>
    </footer>

    

    

  </div>

  

<script type="text/javascript">
  // If window.Promise does not report itself as a function, reset it to null.
  (function () {
    var promiseTag = Object.prototype.toString.call(window.Promise);
    var isFunction = promiseTag === '[object Function]';
    if (!isFunction) {
      window.Promise = null;
    }
  })();
</script>









  












  
  
    <script type="text/javascript" src="/dxl/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/dxl/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
  

  
  
    <script type="text/javascript" src="/dxl/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
  

  
  
    <script type="text/javascript" src="/dxl/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/dxl/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/dxl/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>
  


  


  <script type="text/javascript" src="/dxl/js/src/utils.js?v=5.1.4"></script>

  <script type="text/javascript" src="/dxl/js/src/motion.js?v=5.1.4"></script>



  
  


  <script type="text/javascript" src="/dxl/js/src/affix.js?v=5.1.4"></script>

  <script type="text/javascript" src="/dxl/js/src/schemes/pisces.js?v=5.1.4"></script>



  
  <script type="text/javascript" src="/dxl/js/src/scrollspy.js?v=5.1.4"></script>
<script type="text/javascript" src="/dxl/js/src/post-details.js?v=5.1.4"></script>



  


  <script type="text/javascript" src="/dxl/js/src/bootstrap.js?v=5.1.4"></script>



  


  




	





  





  










  <!-- Comment-system libraries (the AV SDK and Valine). Explicit https: URLs are used instead of
       protocol-relative ones so the scripts also load when the page is opened from file:// and are
       never fetched over plain HTTP. -->
  <script src="https://cdn1.lncld.net/static/js/3.0.4/av-min.js"></script>
  <script src="https://unpkg.com/valine/dist/Valine.min.js"></script>
  
  <script type="text/javascript">
    // Field names that may be requested from a commenter.
    var GUEST = ['nick','mail','link'];
    // Configured field list (comma-separated); anything not in GUEST is dropped.
    var guest = 'nick,mail,link';
    // A plain function expression is used instead of an arrow function so the script
    // still parses on browsers without ES2015 support.
    guest = guest.split(',').filter(function (item) {
      return GUEST.indexOf(item) > -1;
    });
    // Mount the Valine comment widget into the #comments container.
    new Valine({
        el: '#comments',
        verify: false,
        notify: false,
        appId: '13B0JGDuA6ttduN8AQaR8CzF-gzGzoHsz',
        appKey: 'I13r9r5mVgq4jQYpYy6V4gW3',
        placeholder: '欢迎大佬指点~~~',
        avatar: 'mm',
        guest_info: guest,
        // Number of comments per page, passed as a number (the earlier `'10' || 10`
        // always evaluated to the string '10').
        pageSize: 10
    });
  </script>



  





  

  

  

  
  

  

  

  

</body>
</html>
